
/**********************************************************************/
/*
Author: Michelle Han
Created: 30 September, 2022
   Description: Checks and cleans PMO data at person-batch level.
	 Code runs on merged PMO data at person-batch level

   Outputs:
	 $KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta

*/
/**********************************************************************/

  	* Set Filepaths
	* When the global master_run is not "1" (standalone run), load the filepath
	* definitions from the project do-file. Presumably a master run has already
	* set them - confirm against the master do-file.
	if "$master_run" !="1" include "./Do/SET_FILEPATHS.do"
	* Log
	* Close any log that is still open; capture suppresses the error when none is.
	cap log close
	* Store today's date, displayed with format %tdCYND, in the global prefix
	* so that it can be used at the start of the log file name.
	global prefix: display %tdCYND td(`c(current_date)')
	* Open a plain-text log in the KP_logs folder, replacing an existing file of the same name.
	log using "$KP_logs/${prefix}_PMO_clean_long_b1-22.txt", replace text

/*---------------------------------------------------*/
                  /* Section: Checks */
/*----------------------------------------------------*/

	use "$KP_deid_admin/Raw/Merged/pmo_b1-22_raw_long.dta", clear

* Give the raw PMO fields their working names
	rename (ticket_score batch_sk_39 batch_sk_47) (bobot batch_treated_39 batch_treated_47)

* Flag rows where the old and new pass indicators are both present but disagree
	gen has_passed_current_batch_diff = ///
		!missing(has_passed_current_batch, has_passed_current_batch_old) ///
		& has_passed_current_batch != has_passed_current_batch_old
	tabulate has_passed_current_batch_diff

* Where the new data transfer has no pass indicator, fall back to the old one
	replace has_passed_current_batch = has_passed_current_batch_old if mi(has_passed_current_batch)

* Shorter working copy of has_passed_current_batch
	gen win_in_batch = has_passed_current_batch
	
* Count the number of appearances (rows) per person
	tabulate batch, missing
	gsort anon_id4 batch
	by anon_id4: gen appearances = _N
* last_appearance marks each person's final row in batch order, so that
* person-level tabulations count every person exactly once
	by anon_id4 (batch): gen last_appearance = _n == _N
	tabulate appearances if last_appearance, missing
	gstats summ appearances if last_appearance

* Describe missingness, overall and for rows flagged old_ob_only
	mdesc
	mdesc if old_ob_only

* Key fields must never be missing
	foreach keyvar in anon_id4 batch win_in_batch {
		assert `keyvar' != .
	}
	assert !missing(bobot)

* Check that demographic variables are constant within person:
* tag_X is 1 when a person with more than one appearance has differing values of X
	local demog year_dob anon_month_dob gender education anon_hh_id final_test_score
	foreach dvar of varlist `demog' {
		gsort anon_id4 `dvar'
		by anon_id4 (`dvar'): gen tag_`dvar' = `dvar'[1] != `dvar'[_N] if appearances > 1
		tabulate tag_`dvar'
	}

* education -> years of schooling (from https://en.wikipedia.org/wiki/Education_in_Indonesia)
* https://sites.miis.edu/educationinindonesia/files/2013/12/Screenshot-17.png
* The k-th entry of school_years is the years of schooling assigned to education == k.
* Open question kept from the original: how many years of schooling is education = 9?
	local school_years 6 9 12 13.5 15 16 18 21 21
	gen yrs_school = .
	forvalues level = 1/9 {
		local yrs : word `level' of `school_years'
		replace yrs_school = `yrs' if education == `level'
	}

* list anon_id4 batch education if tag_education == 1
	tabulate tag_anon_hh_id gender, column

* Check that a user who gets accepted no longer applies afterwards
* NOTE: all the problematic observations are in batches 1-3
	gsort anon_id4 batch
	by anon_id4 (batch): gen appearance_num = _n
* Defined only on winning rows: 1 when the win is not the person's last appearance
	gen select_batch_not_last = (appearance_num != appearances) if win_in_batch == 1
	tabulate select_batch_not_last

* Person-level count of wins that were followed by a later application
	gsort anon_id4 batch
	egen apply_after_selected = total(select_batch_not_last), by(anon_id4)
	tabulate apply_after_selected
	tabulate batch if apply_after_selected == 1

* list anon_id4 batch has_passed_current_batch if apply_after_selected == 1, clean noobs

* Number of individuals applying from the same household in the same batch
	gsort anon_hh_id batch
	by anon_hh_id batch: gen num_apply_in_batch_hh = _N if !missing(anon_hh_id)
	gen multi_apply_hh = !missing(num_apply_in_batch_hh) & num_apply_in_batch_hh > 1

* Diagnostics on a temporary household-batch level copy; restore discards every change
	preserve
	gsort anon_hh_id batch
	by anon_hh_id batch: keep if _n == 1
	tabulate num_apply_in_batch_hh

	forvalues b = 1/22 {
		display "Batch: `b':"
		tabulate num_apply_in_batch_hh if batch == `b'
	}

* Reduce further to one row per household; nothing is tabulated before the restore
	by anon_hh_id: keep if _n == 1
	restore

** Batch wins
* Person-level win totals: over all rows (ever_win_22) and over rows in batches
* 1 to N for N = 18, 17, 11, 5, 3. The windowed totals carry an if-restriction,
* so they are only filled in on rows inside the window.
	gsort anon_id4
	by anon_id4: gegen ever_win_22 = total(win_in_batch)
	foreach cutoff of numlist 18 17 11 5 3 {
		by anon_id4: gegen ever_win_`cutoff' = total(win_in_batch) if inrange(batch, 1, `cutoff')
	}

* Wins up to batch 39 and batch 47 are inferred from a non-missing treated-batch field
	gen ever_win_39 = !missing(batch_treated_39)

* ever_win_47 is set to missing on rows with merge_t12 == 1
	gen ever_win_47 = !missing(batch_treated_47) if merge_t12 != 1
	tabulate ever_win_47

* Tabulate each total on one row per person
	by anon_id4 : gen ord = _n
	foreach cutoff of numlist 22 18 17 11 5 3 {
		tabulate ever_win_`cutoff' if ord == 1
	}

* Does the household ever win by batch 11 / by batch 17?
* Wins are summed within household over rows that have a province ID and fall in
* batches 1 to the cutoff; the ever-win flag is defined wherever the sum is defined.
	foreach cutoff of numlist 11 17 {
		gsort anon_hh_id
		by anon_hh_id : replace ord = _n
		by anon_hh_id: gegen hh_num_wins_`cutoff' = total(win_in_batch) if anon_prov_id != . & batch >= 1 & batch <= `cutoff'
		tabulate hh_num_wins_`cutoff' if ord == 1
		gen hh_ever_win_`cutoff' = (hh_num_wins_`cutoff' > 0) if !mi(hh_num_wins_`cutoff')
		tabulate hh_ever_win_`cutoff' if ord == 1
	}

* Does the household ever win by batch 22? (no batch restriction, only a province ID is required)
	gsort anon_hh_id
	by anon_hh_id : replace ord = _n
	by anon_hh_id: gegen hh_num_wins_22 = total(win_in_batch) if anon_prov_id != .
	tabulate hh_num_wins_22 if ord == 1
	gen hh_ever_win_22 = (hh_num_wins_22 > 0) if !mi(hh_num_wins_22)
	tabulate hh_ever_win_22 if ord == 1

* is this the first batch individual applies in?
* first_apply_batch is the smallest batch number observed for the person;
* first_apply is 1 on the row whose batch equals it.
	gsort anon_id4
	by anon_id4: gegen first_apply_batch = min(batch)
	* ord is an existing helper variable; here it is renumbered within person
	by anon_id4 : replace ord = _n
	gen first_apply = batch == first_apply_batch
	tab first_apply_batch if ord == 1
	tab batch first_apply, row

* does HH win in this batch?
* hh_wins_in_batch sums win_in_batch within household and batch, computed only
* for rows with a non-missing province ID.
	gsort anon_hh_id batch
	* ord is renumbered within household here (not within household-batch),
	* so the tabulation below uses one row per household
	by anon_hh_id : replace ord = _n
	by anon_hh_id batch: gegen hh_wins_in_batch = total(win_in_batch) if anon_prov_id != .
	tab batch hh_wins_in_batch if ord == 1, row

	* Temporary household-batch level view: inside preserve/restore, ord is
	* renumbered within household-batch and only the first row of each is kept.
	* restore brings back the full data, including the household-level ord.
	preserve
	gsort anon_hh_id batch
	by anon_hh_id batch: replace ord = _n
	keep if ord == 1
	tab batch hh_wins_in_batch
	restore

	* Indicator version of the count; missing wherever the count is missing
	gen hh_win_in_batch = hh_wins_in_batch > 0 if !missing(hh_wins_in_batch)
	* ord is again the within-household numbering set before the preserve
	tab batch hh_win_in_batch if ord == 1, row

* Batch opening dates: the k-th entry of open_dates is the date assigned to batch k
	local open_dates 11apr2020 21apr2020 27apr2020 8aug2020 15aug2020 27aug2020 ///
		3sep2020 10sep2020 17sep2020 26sep2020 2nov2020 23feb2021 4mar2021 ///
		11mar2021 18mar2021 25mar2021 5jun2021 16aug2021 26aug2021 9sep2021 ///
		16sep2021 25oct2021
	gen batch_open = .
	forvalues b = 1/22 {
		local opened : word `b' of `open_dates'
		replace batch_open = td("`opened'") if batch == `b'
	}

* Batch year: 2020 for batches up to 11, 2021 otherwise
	gen batch_year = cond(batch <= 11, 2020, 2021)

* Strata are the distinct combinations of weight (bobot), province and batch
	gegen strata = group(bobot anon_prov_id batch)

* Inspect the inputs to the stratum win probability
	foreach cellvar in anon_prov_id batch bobot has_passed_current_batch {
		tabulate `cellvar', missing
	}

* Stratum win probability = mean of has_passed_current_batch within
* batch x province x weight, computed on rows that have a province ID
	preserve
		keep if !missing(anon_prov_id)
		collapse (mean) stratum_win_prob = has_passed_current_batch, by(batch anon_prov_id bobot)
		tempfile winprob_cells
		save `winprob_cells'
	restore

* Attach the cell means to every row; the _merge variable created here stays in the data
	merge m:1 batch anon_prov_id bobot using `winprob_cells', keepusing(stratum_win_prob)

* Checks: the probability must be constant within each cell
	bysort batch anon_prov_id bobot (stratum_win_prob) : assert stratum_win_prob == stratum_win_prob[1]
	summarize stratum_win_prob
	mdesc stratum_win_prob

* Convert date_incentive from a year-month-day string into a Stata daily date
	rename date_incentive date_incentive_str
	gen date_incentive = date(date_incentive_str, "YMD")
	format %td date_incentive
	summarize date_incentive
	drop date_incentive_str

* Label variables
* Attaches a descriptive label to every analysis variable.
* NOTE(review): several label texts below are longer than 80 characters, which is
* Stata's documented maximum for a variable label - confirm how the installed
* version handles the excess.

* Win / treatment outcomes from the updated PMO transfer
	la var win_in_batch "Indicator for having passed current batch (updated PMO data, Sep '22)"
	la var batch_treated_39 "Batch that respondent was treated, up to batch 39 (updated PMO data, Sep '22)"
	la var batch_treated_47 "Batch that respondent was treated, up to batch 47 for control group survey respondents only (updated PMO data, July '23)"
	la var date_batch "Date that applicant was announced winner of lottery (updated PMO data, Sep '22)"
* Fixed: the original had no space between the variable name and the opening quote,
* and a stray leading space inside the label text.
	la var date_incentive "Date that applicant received cash transfer (updated PMO data, Sep '22)"
	la var status_revoked "Flag for having status revoked due to a) not registering for a course on time or b) received other social assistance programs(updated PMO data, Sep '22)"
	la var ever_win_3 "Ever Won Prakerja, by Batch 3 (updated PMO data, Sep '22)"
	la var ever_win_5 "Ever Won Prakerja, by Batch 5 (updated PMO data, Sep '22)"
	la var ever_win_11 "Ever Won Prakerja, by Batch 11 (updated PMO data, Sep '22)"
	la var ever_win_17 "Ever Won Prakerja, by Batch 17 (updated PMO data, Sep '22)"
	la var ever_win_18 "Ever Won Prakerja, by Batch 18 (updated PMO data, Sep '22)"
	la var ever_win_22 "Ever Won Prakerja, by Batch 22 (updated PMO data, Sep '22)"
	la var ever_win_39 "Ever Won Prakerja, by Batch 39 (updated PMO data, Sep '22)"
	la var first_apply_batch "Batch of First Application (updated PMO data, Sep '22)"

* Demographics
	la var year_dob "Year of Birth"
	la var anon_month_dob "Month of Birth"
	la var gender "Gender"
	la var final_test_score "Score on Skills Test"
	la var education "Education level"

* Household-level win indicators
* hh_ever_win_11 is created earlier in this file and kept in the saved data,
* so it is labelled in the same way as the batch 17 and batch 22 versions.
	la var hh_ever_win_11 "Household Ever Won Prakerja, by Batch 11"
	la var hh_ever_win_17 "Household Ever Won Prakerja, by Batch 17"
	la var hh_ever_win_22 "Household Ever Won Prakerja, by Batch 22"

* Registration questionnaire, 2020 registrants
	la var aaa1 "[For 2020 registrants] Are you unemployed at the moment?"
	la var aaa2 "[For 2020 registrants] Are you an employee?"
	la var aaa3 "[For 2020 registrants] Have you ever worked before?"
	la var aaa4 "[For 2020 registrants] Are you unemployed due to Covid-19?"
	la var aaa5 "[For 2020 registrants] Do you have job contract?"
	la var aaa6 "[For 2020 registrants] Are you self-employed?"
	la var aaa7 "[For 2020 registrants] Covid-19 impact: decrease revenue, less customer?"
	la var aaa8 "[For 2020 registrants] Is your business closed temporarily due to government advice?"
	la var aaa9 "[For 2020 registrants] Is your business closed temporarily because you cannot pay your employee?"

* Registration questionnaire, 2021 semester 1 registrants
	la var aaa20 "[For 2021, semester 1 registrants] Are you unemployed at the moment?"
	la var aaa21 "[For 2021 semester 1 registrants] Are you an employee?"
	la var aaa22 "[For 2021 semester 1 registrants] Have you ever worked before?"
	la var aaa23 "[For 2021 semester 1 registrants] Are you self-employed?"
	la var aaa24 "[For 2021 semester 1 registrants] Do you have job contract?"
	la var aaa25 "[For 2021 semester 1 registrants] Are you unemployed due to Covid-19?"
	la var aaa26 "[For 2021 semester 1 registrants] Covid-19 impact: decrease revenue, less customer?"
	la var aaa27 "[For 2021 semester 1 registrants] Does your working hour decrease due to Covid-19?"

* Registration questionnaire, 2021 semester 2 registrants
	la var aaa31 "[For 2021 semester 2 registrants] Are you unemployed at the moment?"
	la var aaa32 "[For 2021 semester 2 registrants] Are you an employee?"
	la var aaa33 "[For 2021 semester 2 registrants] Have you ever worked before?"
	la var aaa34 "[For 2021 semester 2 registrants] Are you self-employed?"
	la var aaa35 "[For 2021 semester 2 registrants] Do you have job contract?"
	la var aaa36 "[For 2021 semester 2 registrants] Are you unemployed due to Covid-19?"
	la var aaa37 "[For 2021 semester 2 registrants] Covid-19 impact: decrease revenue, less customer?"
	la var aaa38 "[For 2021 semester 2 registrants] Does your working hour decrease due to Covid-19?"

* Identifiers, data-transfer provenance flags and batch-level variables
	la var anon_prov_id "Province ID"
	la var anon_hh_id "Household ID"
	la var old_ob_only "Indicator for observation only appearing in original PMO data transfer"
	la var new_ob_only "Indicator for observation only appearing in updated PMO data transfer (Sep '22)"
	la var match_add_var "Indicator for observation appearing in both original PMO data transfer and updated PMO data transfer (Sep '22)"
	la var batch_open "Date that batch opened"
	la var batch_year "Year that batch opened"
	la var age "Age when batch opened"
	la var hh_win_in_batch "Indicator for household winning in batch"
	la var batch "Batch"
	la var multi_apply_hh "Multiple people from household applied"
	la var strata "Strata (group weight batch province)"
	la var java "Dummy for being on Java"

* Save data
* Sort, drop flagged rows and helper variables, compress, and save only when the
* data signature matches the reference value recorded below.
	gsort anon_id4 batch
* NOTE(review): only people with exactly one "win that was not their last appearance"
* are dropped here; a count above 1 would be kept - confirm this is intended.
	drop if apply_after_selected == 1
* NOTE(review): hh_num_wins_11 is not in this drop list although hh_num_wins_17 and
* hh_num_wins_22 are. Removing it would change the data signature checked below,
* so it is left as is.
	drop tag_* appearance* select_batch_not_last ///
		ord has_passed_current_batch has_passed_current_batch_diff apply_after_selected hh_wins_in_batch ///
		 first_apply hh_num_wins_17 hh_num_wins_22 last_appearance
	order anon_id4 anon_hh_id batch gender win_in_batch
	compress
	datasignature

* Reproducibility guard: write the file only if this machine produced the reference dataset
	if "`r(datasignature)'" == "58864738:72(35686):288457467:940895934" {
		save "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		* The original used "stop", which is not a Stata command: it halted the run
		* only through an unrecognized-command error and left the log open.
		* Close the log, then abort with a nonzero return code so that a calling
		* do-file stops as well.
		cap log close
		exit 459
	}

	cap log close


// DONE
